beerData<-read_csv("./Data/Beers.csv")
## Parsed with column specification:
## cols(
##   Name = col_character(),
##   Beer_ID = col_integer(),
##   ABV = col_double(),
##   IBU = col_integer(),
##   Brewery_id = col_integer(),
##   Style = col_character(),
##   Ounces = col_double()
## )
breweryData<-read_csv("./Data/Breweries.csv")
## Parsed with column specification:
## cols(
##   Brew_ID = col_integer(),
##   Name = col_character(),
##   City = col_character(),
##   State = col_character()
## )
###b. What are the dimensions of the data?

glimpse(beerData)
## Observations: 2,410
## Variables: 7
## $ Name       <chr> "Pub Beer", "Devil's Cup", "Rise of the Phoenix", "Si…
## $ Beer_ID    <int> 1436, 2265, 2264, 2263, 2262, 2261, 2260, 2259, 2258,…
## $ ABV        <dbl> 0.050, 0.066, 0.071, 0.090, 0.075, 0.077, 0.045, 0.06…
## $ IBU        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Brewery_id <int> 409, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178…
## $ Style      <chr> "American Pale Lager", "American Pale Ale (APA)", "Am…
## $ Ounces     <dbl> 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0,…
summary(beerData)
##      Name              Beer_ID            ABV               IBU        
##  Length:2410        Min.   :   1.0   Min.   :0.00100   Min.   :  4.00  
##  Class :character   1st Qu.: 808.2   1st Qu.:0.05000   1st Qu.: 21.00  
##  Mode  :character   Median :1453.5   Median :0.05600   Median : 35.00  
##                     Mean   :1431.1   Mean   :0.05977   Mean   : 42.71  
##                     3rd Qu.:2075.8   3rd Qu.:0.06700   3rd Qu.: 64.00  
##                     Max.   :2692.0   Max.   :0.12800   Max.   :138.00  
##                                      NA's   :62        NA's   :1005    
##    Brewery_id       Style               Ounces     
##  Min.   :  1.0   Length:2410        Min.   : 8.40  
##  1st Qu.: 94.0   Class :character   1st Qu.:12.00  
##  Median :206.0   Mode  :character   Median :12.00  
##  Mean   :232.7                      Mean   :13.59  
##  3rd Qu.:367.0                      3rd Qu.:16.00  
##  Max.   :558.0                      Max.   :32.00  
## 
#Name              Beer_ID            ABV               IBU           Brewery_id       Style               Ounces 

#beer summarized by style 
beerData %>%
  group_by(Style) %>%
  summarise(
    count= n(),
    IBU_count = sum(!is.na(IBU)),
    IBU_average = mean(IBU, na.rm = TRUE),
    IBU_sd = sd(IBU, na.rm = TRUE),
    ABV_count = sum(!is.na(ABV)),
    ABV_average = mean(ABV, na.rm = TRUE),
    ABV_sd = sd(ABV, na.rm = TRUE),
    Ounces_count = sum(!is.na(Ounces)),
    Ounces_average = mean(Ounces, na.rm = TRUE),
    Ounces_sd = sd(Ounces, na.rm = TRUE)
  )
## # A tibble: 100 x 11
##    Style count IBU_count IBU_average IBU_sd ABV_count ABV_average  ABV_sd
##    <chr> <int>     <int>       <dbl>  <dbl>     <int>       <dbl>   <dbl>
##  1 <NA>      5         2        24     4.24         2      0.0565 0.00495
##  2 Abbe…     2         2        22     0            2      0.049  0      
##  3 Altb…    13         8        34.1   8.34        13      0.0544 0.00690
##  4 Amer…    18        11        11     3.77        18      0.0487 0.00431
##  5 Amer…   133        77        36.3  19.6        125      0.0575 0.00868
##  6 Amer…    29        16        23.2   9.39        28      0.0495 0.00464
##  7 Amer…     3         2        96     5.66         3      0.099  0      
##  8 Amer…    36        20        68.9  21.4         35      0.0690 0.0126 
##  9 Amer…   108        61        21.0   7.28       104      0.0496 0.00494
## 10 Amer…    70        38        29.9  12.3         68      0.0579 0.00971
## # … with 90 more rows, and 3 more variables: Ounces_count <int>,
## #   Ounces_average <dbl>, Ounces_sd <dbl>
#Percentage of missing values in dataset
paste0("The total number of beers is: ", nrow(beerData))
## [1] "The total number of beers is: 2410"
percent_missing<-beerData %>% 
  summarise_all(funs(100*sum(is.na(.))/nrow(beerData))) %>% #count the number of NAs or blanks divided by total rows
  round(2) %>% 
  lapply(function(x){paste0(x,"%")}) %>%
  as_tibble()
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
##   # Before:
##   funs(name = f(.))
## 
##   # After: 
##   list(name = ~ f(.))
## This warning is displayed once per session.
missing_info <- paste('  ABV >', percent_missing$ABV, '  IBU >', percent_missing$IBU, '  Style >', percent_missing$Style)
# ABV, IBU, Style have missing values 

#hmm what should be done about the missing values...
glimpse(breweryData)
## Observations: 558
## Variables: 4
## $ Brew_ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ Name    <chr> "NorthGate Brewing", "Against the Grain Brewery", "Jack'…
## $ City    <chr> "Minneapolis", "Louisville", "Framingham", "San Diego", …
## $ State   <chr> "MN", "KY", "MA", "CA", "CA", "SC", "CO", "MI", "MI", "M…
summary(breweryData)
##     Brew_ID          Name               City              State          
##  Min.   :  1.0   Length:558         Length:558         Length:558        
##  1st Qu.:140.2   Class :character   Class :character   Class :character  
##  Median :279.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :279.5                                                           
##  3rd Qu.:418.8                                                           
##  Max.   :558.0
#    Brew_ID          Name               City              State          

#Report the number of NA's in each column.
breweryData %>% 
  summarise_all(funs(sum(is.na(.))))#count the number of NAs or blanks
## # A tibble: 1 x 4
##   Brew_ID  Name  City State
##     <int> <int> <int> <int>
## 1       0     0     0     0
#no missing values 
library("httr")
library("jsonlite")
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
#2018 projected population data
pop_url <- "https://api.census.gov/data/2018/pep/population?get=POP,GEONAME&for=state"

statepop_raw<- pop_url %>% 
  GET() %>% 
  content("text") %>%
  fromJSON(flatten=TRUE) %>% 
  as_tibble() %>% 
  slice(2:n())#remove the first row which is the labels 
## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
names(statepop_raw)<-c("Population","State","State_ID")

statepop_raw$Population<- as.integer(statepop_raw$Population)

#add state abbreviations and DC/puerto rico
statepop_raw$State_Abb <- c(sort(c("DC",state.abb)),"PR")
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.2
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
area_url <-"https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area"

statearea<- area_url %>% 
  read_html() %>% 
  html_node('#mw-content-text > div > table:nth-child(6)') %>% 
  html_table(header = F) %>% 
  slice(3:n()) %>% #remove the two rows which are labels
  select(X1,X6) %>% #keep the state area excluding water
  rename("State_name"="X1","Square_miles"="X6") %>% 
  mutate(Square_miles=as.numeric(gsub(",", "", Square_miles))) #remove thousands separator ","

Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.

beer_brewery<-beerData %>%
  left_join(breweryData, by=c("Brewery_id"="Brew_ID"), suffix = c(".beer", ".brewery")) %>% 
  rename(Beer_name=Name.beer, Brewery_name=Name.brewery) %>% 
  #separate continuous variables into 3 groups, low/medium/high
  mutate(ABV_rating= cut_number(ABV, n = 3,labels=c("low","medium","high"))) %>% 
  mutate(IBU_rating= cut_number(IBU, n = 3,labels=c("low","medium","high"))) 
  
beer_brewery %>% 
  filter(is.na(Brewery_name)) 
## # A tibble: 0 x 12
## # … with 12 variables: Beer_name <chr>, Beer_ID <int>, ABV <dbl>,
## #   IBU <int>, Brewery_id <int>, Style <chr>, Ounces <dbl>,
## #   Brewery_name <chr>, City <chr>, State <chr>, ABV_rating <fct>,
## #   IBU_rating <fct>
#returns no results so all beers have a brewery

#Post merge data peak 
first_combo_brewery<-head(beer_brewery,6)
last_combo_brewery<-tail(beer_brewery,6)
## Warning: package 'tidystringdist' was built under R version 3.5.2
## Warning in do_dist(a = b, b = a, method = method, weight = weight, q =
## q, : Non-printable ascii or non-ascii characters in soundex. Results may be
## unreliable. See ?printable_ascii.
## # A tibble: 2,653,056 x 12
##    V1    V2      osa    lv    dl hamming   lcs qgram cosine jaccard     jw
##    <chr> <chr> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>   <dbl>  <dbl>
##  1 Brew… Brew…     1     1     1       1     2     2 0.0147   0.105 0.0238
##  2 Boon… Boon…     1     1     1       1     2     2 0.0278   0     0.0303
##  3 Broo… Broo…     1     1     1       1     2     2 0.0270   0.105 0.0290
##  4 Cald… Cald…     1     1     1       1     2     2 0.0417   0.125 0.0370
##  5 Cald… Cald…     1     1     1       1     2     2 0.0204   0     0.0290
##  6 Cald… Cald…     1     1     1       1     2     2 0.0204   0.133 0.0290
##  7 Cald… Cald…     1     1     1       1     2     2 0.0204   0.133 0.0290
##  8 Cald… Cald…     1     1     1       1     2     2 0.0204   0.133 0.0290
##  9 Indi… Indi…     1     1     1       1     2     2 0.0426   0     0.137 
## 10 Vert… Vort…     1     1     1       1     2     2 0.0871   0.100 0.122 
## # … with 2,653,046 more rows, and 1 more variable: soundex <dbl>
## Warning in do_dist(a = b, b = a, method = method, weight = weight, q =
## q, : Non-printable ascii or non-ascii characters in soundex. Results may be
## unreliable. See ?printable_ascii.
## # A tibble: 4,950 x 12
##    V1    V2      osa    lv    dl hamming   lcs qgram cosine jaccard    jw
##    <chr> <chr> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>   <dbl> <dbl>
##  1 Altb… Witb…     2     2     2       2     4     4 0.244    0.375 0.190
##  2 Cider Other     3     3     3       3     6     6 0.6      0.75  0.4  
##  3 Cider Radl…     3     3     3     Inf     5     5 0.452    0.625 0.3  
##  4 Belg… Belg…     3     3     3       3     6     6 0.0979   0.308 0.176
##  5 Amer… Amer…     3     3     3       3     6     6 0.0607   0.188 0.170
##  6 Belg… Belg…     3     3     3       3     6     6 0.0660   0.188 0.120
##  7 Gose  Bock      3     3     3       3     6     6 0.75     0.857 0.5  
##  8 Crea… Whea…     3     3     3       3     6     6 0.273    0.545 0.222
##  9 Euro… Euro…     3     3     3       3     6     6 0.124    0.308 0.203
## 10 Engl… Engl…     3     3     3       3     6     6 0.0720   0.235 0.179
## # … with 4,940 more rows, and 1 more variable: soundex <dbl>
## Warning in do_dist(a = b, b = a, method = method, weight = weight, q =
## q, : Non-printable ascii or non-ascii characters in soundex. Results may be
## unreliable. See ?printable_ascii.
## # A tibble: 151,525 x 12
##    V1    V2      osa    lv    dl hamming   lcs qgram  cosine jaccard     jw
##    <chr> <chr> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl>   <dbl>  <dbl>
##  1 Agai… Agai…     1     1     1       1     2     2 0.0198   0.0625 0.0267
##  2 Rive… Rive…     1     1     1     Inf     1     1 0.00949  0      0.0395
##  3 Ashe… Aspe…     2     2     2       2     4     4 0.0626   0.0588 0.178 
##  4 Asla… Aspe…     2     2     2       2     4     4 0.0620   0.0588 0.107 
##  5 Aver… Lave…     2     2     2     Inf     3     3 0.0451   0.118  0.0462
##  6 Fate… Matt…     2     2     2       2     4     4 0.0714   0.118  0.104 
##  7 Kalo… Kona…     2     2     2     Inf     2     2 0.0250   0.0625 0.0553
##  8 Kona… NoDa…     2     2     2       2     4     4 0.0646   0.176  0.215 
##  9 Matt… Maui…     2     2     2       2     4     4 0.107    0.118  0.122 
## 10 Maui… Miam…     2     2     2     Inf     3     3 0.0417   0.0625 0.119 
## # … with 151,515 more rows, and 1 more variable: soundex <dbl>

Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

#Summary data by State 
beer_brewery_byState<-beer_brewery %>%
  group_by(State) %>%
  summarise(
    Number_of_beers= n(),
    Number_of_breweries=length(unique(Brewery_name)),
    Number_of_styles=length(unique(Style)),
    IBU_count = sum(!is.na(IBU)),
    IBU_average = mean(IBU, na.rm = TRUE),
    IBU_median = median(IBU, na.rm = TRUE),
    IBU_sd = sd(IBU, na.rm = TRUE),
    ABV_count = sum(!is.na(ABV)),
    ABV_average = mean(ABV, na.rm = TRUE),
    ABV_median = median(ABV, na.rm = TRUE),
    ABV_sd = sd(ABV, na.rm = TRUE),
    Ounces_count = sum(!is.na(Ounces)),
    Ounces_average = mean(Ounces, na.rm = TRUE),
    Ounces_median = median(Ounces, na.rm = TRUE),
    Ounces_sd = sd(Ounces, na.rm = TRUE),
  )

#add counts for Ounces
beer_brewery_byState<-beer_brewery %>%
  filter(!is.na(Ounces)) %>% 
  count(State,Ounces) %>% 
  spread(Ounces,n, fill=0,sep = "_") %>% 
  right_join(beer_brewery_byState, by="State")

#add category counts for ABV_rating
beer_brewery_byState<-beer_brewery %>%
  filter(!is.na(ABV_rating)) %>% 
  count(State,ABV_rating) %>% 
  spread(ABV_rating,n, fill=0,sep = "_") %>%
  right_join(beer_brewery_byState, by="State")

#add category counts for IBU_rating
beer_brewery_byState<-beer_brewery %>%
  filter(!is.na(IBU_rating)) %>% 
  count(State,IBU_rating) %>% 
  spread(IBU_rating,n, fill=0,sep = "_") %>% 
  right_join(beer_brewery_byState, by="State") 

#add in population values
beer_brewery_byState<-beer_brewery_byState %>% 
  left_join(y=statepop_raw, by=c("State"="State_Abb")) %>% 
  select(c(-State_ID)) %>% 
  rename(State_name=State.y) %>% 
  mutate(breweries_per_million_person=(1000000*Number_of_breweries/Population))

#add in state area in Square_miles
beer_brewery_byState<-beer_brewery_byState %>% 
  left_join(y=statearea, by="State_name") %>% 
  mutate(breweries_per_thousand_sqmiles=(1000*Number_of_breweries/Square_miles))

#add ranks
beer_brewery_byState<-beer_brewery_byState %>% 
  mutate(ABV_rank=rank(desc(ABV_median)),
         ABV_sd_rank=rank(desc(ABV_sd)),
         IBU_rank=rank(desc(IBU_median)),
         IBU_sd_rank=rank(desc(IBU_sd)),
         Ounce_rank=rank(desc(Ounces_median)), 
         Ounce_sd=rank(desc(Ounces_sd))
         )

How many breweries in each state

brewstate<-beer_brewery_byState %>% 
  arrange(desc(Number_of_breweries)) %>% 
  select(State_name, Number_of_breweries, Number_of_beers, Number_of_styles) %>% 
  rename_all(funs(str_replace_all(., "_", " ")))

top_brewstate <- head(brewstate,10) 
bottom_brewstate <- tail(brewstate,10)

top_brewstate
## # A tibble: 10 x 4
##    `State name` `Number of breweries` `Number of beers` `Number of styles`
##    <chr>                        <int>             <int>              <int>
##  1 Colorado                        46               265                 61
##  2 California                      39               183                 46
##  3 Michigan                        32               162                 46
##  4 Oregon                          29               125                 29
##  5 Texas                           28               130                 46
##  6 Pennsylvania                    24               100                 42
##  7 Maine                           23                82                 33
##  8 Washington                      23                68                 23
##  9 Iowa                            22               139                 47
## 10 Nebraska                        19                59                 25
bottom_brewstate
## # A tibble: 10 x 4
##    `State name`      `Number of breweri… `Number of beer… `Number of style…
##    <chr>                           <int>            <int>             <int>
##  1 New Mexico                          3                8                 6
##  2 Tennessee                           3                6                 5
##  3 Arizona                             2                5                 5
##  4 District of Colu…                   2                2                 2
##  5 Missouri                            2               11                 8
##  6 North Carolina                      2               11                10
##  7 Delaware                            1                8                 8
##  8 Nevada                              1                3                 3
##  9 South Dakota                        1                7                 7
## 10 Wisconsin                           1                2                 2
states <- map_data("state")

map_data<-beer_brewery_byState %>% 
  mutate(State_name=tolower(State_name)) %>%  #to match the state mapping data
  right_join(states, by=c("State_name"="region"))

state_base <- ggplot(data = map_data, mapping = aes(x = long, y = lat, group = group)) +
      geom_polygon(color = "black", fill = "gray") + 
      theme_void()
state_base +
      geom_polygon(data = map_data, aes(fill = Population)) #TODO(BJH) think about breaking into subregions 

library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
#Example on how to arrange multiple plots 


#MISC 
state_base + geom_polygon(data = map_data, aes(fill = Number_of_styles)) +
      scale_fill_viridis(option="D")

state_base + geom_polygon(data = map_data, aes(fill = Ounces_median)) +
      scale_fill_viridis(option="D")

#General Statistics
grid.arrange(
    state_base + geom_polygon(data = map_data, aes(fill = Number_of_breweries)) +
      scale_fill_viridis(option="D")+ labs(fill = "# of Breweries"),
    state_base + geom_polygon(data = map_data, aes(fill = Number_of_beers)) +
      scale_fill_viridis(option="D")+ labs(fill = "# of Beers"),
    state_base + geom_polygon(data = map_data, aes(fill = breweries_per_million_person)) +
      scale_fill_viridis(option="D")+ labs(fill = "Breweries/Mil People"),
    state_base + geom_polygon(data = map_data, aes(fill = Population/1000000))  + labs(fill = "Population (millions)") + scale_fill_viridis(option="D"),
    ncol=2, nrow=2
) %>% 
  ggsave(filename='Export/general_statistics_map.jpg')
## Saving 7 x 5 in image

#ABV data mapped 
grid.arrange(
  state_base + geom_polygon(data = map_data, aes(fill = ABV_rating_high)) + scale_fill_viridis(option="D") +
    labs(fill = "High ABV Beers"), 
  # state_base + geom_polygon(data = map_data, aes(fill = ABV_rating_medium)) + scale_fill_viridis(option="D"), 
  state_base + geom_polygon(data = map_data, aes(fill = ABV_rating_low))+ scale_fill_viridis(option="D") + 
    labs(fill = "Low ABV Beers"),  
  state_base + geom_polygon(data = map_data, aes(fill = ABV_sd)) + scale_fill_viridis(option="D") + 
        labs(fill = "ABV Standard Deviation"),
  state_base + geom_polygon(data = map_data, aes(fill = ABV_median)) + scale_fill_viridis(option="D")+ 
        labs(fill = "ABV Median"),
    ncol=2, nrow=2
) %>% 
  ggsave(filename='Export/ABV_beers_map.jpg')
## Saving 7 x 5 in image

#IBU data mapped 
grid.arrange(
  state_base + geom_polygon(data = map_data, aes(fill = IBU_rating_high)) + scale_fill_viridis(option="D") +
    labs(fill = "High IBU Beers"), 
  # state_base + geom_polygon(data = map_data, aes(fill = IBU_rating_medium)) + scale_fill_viridis(option="D"), 
  state_base + geom_polygon(data = map_data, aes(fill = IBU_rating_low))+ scale_fill_viridis(option="D") + 
    labs(fill = "Low IBU Beers"),  
  state_base + geom_polygon(data = map_data, aes(fill = IBU_sd)) + scale_fill_viridis(option="D") + 
        labs(fill = "IBU Std. Dev."),
  state_base + geom_polygon(data = map_data, aes(fill = IBU_median)) + scale_fill_viridis(option="D")+ 
        labs(fill = "IBU Median"),
    ncol=2, nrow=2
) %>% 
  ggsave(filename='Export/IBU_beers_map.jpg')
## Saving 7 x 5 in image

#MAPPING ABV BY STATE
grid.arrange(
  #HIGH ABV Beers per million people
  state_base +
      geom_polygon(data = map_data, aes(fill = ABV_rating_high)) + 
      scale_fill_continuous(name = "High ABV Beers"), 
  #MEDIUM ABV Beers per million people
  # state_base +
  #     geom_polygon(data = map_data, aes(fill = ABV_rating_medium)) + 
  #     scale_fill_continuous(name = "Medium ABV Beers"),  
  #LOW ABV Beers per million people
  state_base +
      geom_polygon(data = map_data, aes(fill = ABV_rating_low)) + 
    scale_fill_continuous(name = "Low ABV Beers"),
  #HIGH ABV Beers per million people
  state_base +
      geom_polygon(data = map_data, aes(fill = (1000000*ABV_rating_high)/Population))+
      scale_fill_continuous(name = "High ABV per mil."),
  #MEDIUM ABV Beers per million people
  # state_base +
  #     geom_polygon(data = map_data, aes(fill = (1000000*ABV_rating_medium)/Population))+
  #     scale_fill_continuous(name = "Medium ABV beers per million people"),
  #LOW ABV Beers per million people
  state_base +
      geom_polygon(data = map_data, aes(fill = (1000000*ABV_rating_low)/Population))+
      scale_fill_continuous(name = "Low ABV beers per mil.")  
  ,ncol=2, nrow=2
) %>% 
  ggsave(filename = "Export/ABV_by_state.jpg")
## Saving 7 x 5 in image

state_base +
      geom_polygon(data = map_data, aes(fill = ABV_sd))

ggsave('Export/ABV_SD_data.jpg')
## Saving 7 x 5 in image
state_base +
      geom_polygon(data = map_data, aes(fill = IBU_rating_high))

ggsave('Export/ibu_high_data.jpg')
## Saving 7 x 5 in image
state_base +
      geom_polygon(data = map_data, aes(fill = IBU_rating_medium))

ggsave('Export/ibu_med_data.jpg')
## Saving 7 x 5 in image
state_base +
      geom_polygon(data = map_data, aes(fill = IBU_rating_low))

ggsave('Export/ibu_low_data.jpg')
## Saving 7 x 5 in image
state_base +
      geom_polygon(data = map_data, aes(fill = IBU_sd))

ggsave('Export/ibu_SD_data.jpg')
## Saving 7 x 5 in image
#top 10 states for having high IBU beers
top_ibu<-beer_brewery_byState %>% 
  arrange(desc(IBU_rating_high)) %>% 
  head(10)

#bottom 10 states for having high IBU beers
bottom_ibu<-beer_brewery_byState %>% 
  arrange(desc(IBU_rating_high)) %>% 
  tail(10)

#barplot of median IBU by state, maybe do the top 10 and bottom 10 becuase it's crowded
top_ibu<-beer_brewery_byState %>%
  arrange(IBU_median) %>% 
  mutate(State=factor(State, levels=State)) %>%
  head(10) %>% 
  ggplot(aes(x=State, y=IBU_median)) + geom_col()+ 
  labs(title="States with Highest IBU", 
       subtitle="", y="Median IBU")

ggsave('Export/Hi_median_alch.jpg')
## Saving 7 x 5 in image
beer_brewery_byState %>%
  arrange(desc(IBU_median)) %>% 
  mutate(State=factor(State, levels=State)) %>%
  head(10) %>% 
  ggplot(aes(x=State, y=IBU_median)) + geom_col() + 
  labs(title="States with Lowest IBU", 
       y="IBU")

ggsave('Export/Low_median_alch.jpg')
## Saving 7 x 5 in image
#barplot of median ABV by state, maybe do the top 10 and bottom 10 because it's crowded
beer_brewery_byState %>%
  arrange(ABV_median) %>% #lowest to highest
  mutate(State=factor(State, levels=State)) %>%# To set the order appropriately for the plot
  head(10) %>% 
  ggplot(aes(x=State, y=ABV_median)) + geom_col() + 
  coord_flip()+ 
  labs(title="States with Lowest ABV", 
       y="ABV")

ggsave('Export/bottom_abv_summary.jpg')
## Saving 7 x 5 in image
beer_brewery_byState %>%
  arrange(desc(ABV_median)) %>% #highest to lowers 
  mutate(State=factor(State, levels=State)) %>% # To set the order appropriately for the plot
  head(10) %>% 
  ggplot(aes(x=State, y=ABV_median)) + geom_col() + 
  coord_flip() + 
  labs(title="States with Highest ABV", 
       y="ABV")

ggsave('Export/top_abv_summary.jpg')
## Saving 7 x 5 in image
#BASIC POSITIVE CORRELATION 

beer_brewery %>% 
  ggplot(aes(x=IBU, y=ABV)) + geom_point()+ 
    labs(title="ABV and IBU Scatterplot")
## Warning: Removed 1005 rows containing missing values (geom_point).

ggsave('Export/ibu_abv_scat_before.jpg')
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
#WITH MARKER INDICATING MARKET OPPORTUNITY
beer_brewery %>% 
  ggplot(aes(x=IBU, y=ABV)) + 
  geom_point() + 
  annotate("rect", xmin=c(50), xmax=c(120), ymin=c(.035) , ymax=c(.06), alpha=0.2, color="blue", fill="blue") + 
  annotate("text", label="Opportunity", x=c(70), y=c(.04), color="Blue")+ 
    labs(title="ABV and IBU Scatterplot", 
       subtitle="Market Opportunity")
## Warning: Removed 1005 rows containing missing values (geom_point).

ggsave('Export/ibu_abv_scat.jpg')
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
  beer_brewery %>% 
    ggplot(aes(x=Ounces, y=IBU)) + geom_point(position = "jitter")+ 
      labs(title="IBU and OUNCES Scatterplot") 
## Warning: Removed 1005 rows containing missing values (geom_point).

  ggsave(filename="Export/ounce_IBU.jpg")
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
 beer_brewery %>% 
  ggplot(aes(x=as.factor(Ounces), y=IBU)) + geom_point(position = "jitter")+ 
    labs(title="IBU and OUNCES Scatterplot", x="Ounces")
## Warning: Removed 1005 rows containing missing values (geom_point).

    ggsave(filename="Export/ounce_ABV.jpg")
## Saving 7 x 5 in image
## Warning: Removed 1005 rows containing missing values (geom_point).
 beer_brewery %>% 
  ggplot(aes(x=as.factor(Ounces), y=ABV)) + geom_point(position = "jitter")+ 
    labs(title="ABV and OUNCES Scatterplot", x="Ounces")
## Warning: Removed 62 rows containing missing values (geom_point).

Which state has the maximum alcoholic (ABV) beer?

#top 10 states for having high ABV beers
top_abv<-beer_brewery_byState %>% 
  arrange(desc(ABV_rating_high)) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)

Which state has highest variance in ABV?

#top 10 states with the most variance in ABV
top_abv_sd<-beer_brewery_byState %>% 
  arrange(desc(ABV_sd)) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)

#top 10 states with the least variance in ABV
bottom_abv_sd<-beer_brewery_byState %>% 
  arrange(ABV_sd) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)

Which states have the most and least variance in beers?

#top 10 states with the most variance in ABV
top_abv_sd<-beer_brewery_byState %>% 
  arrange(desc(ABV_sd)) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)

#top 10 states with the least variance in ABV
bottom_abv_sd<-beer_brewery_byState %>% 
  arrange(ABV_sd) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)
#top 10 states for having high IBU beers
top_ibu<-beer_brewery_byState %>% 
  arrange(desc(IBU_median)) %>% 
  select(State, IBU_sd, IBU_sd_rank, IBU_median, IBU_rank, ABV_median, ABV_rank) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)

top_ibu
## # A tibble: 10 x 7
##    State `IBU sd` `IBU sd rank` `IBU median` `IBU rank` `ABV median`
##    <chr>    <dbl>         <dbl>        <dbl>      <dbl>        <dbl>
##  1 ME        17.4            44         61          1         0.051 
##  2 WV        19.1            42         57.5        2         0.062 
##  3 FL        22.5            34         55          3.5       0.057 
##  4 GA        16.2            47         55          3.5       0.055 
##  5 DE        NA              50         52          5         0.055 
##  6 NM        36.7             3         51          6         0.062 
##  7 NH        47.4             2         48.5        7         0.055 
##  8 DC        50.9             1         47.5        8         0.0625
##  9 NY        23.4            30         47          9         0.055 
## 10 AK        23.3            31         46         10         0.056 
## # … with 1 more variable: `ABV rank` <dbl>
#top 10 states with the most variance in IBU
top_ibu_sd<-beer_brewery_byState %>% 
  arrange(desc(IBU_sd)) %>% 
  select(State, IBU_sd, IBU_sd_rank, IBU_median, IBU_rank, ABV_median, ABV_rank) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)

top_ibu_sd
## # A tibble: 10 x 7
##    State `IBU sd` `IBU sd rank` `IBU median` `IBU rank` `ABV median`
##    <chr>    <dbl>         <dbl>        <dbl>      <dbl>        <dbl>
##  1 DC        50.9             1         47.5        8         0.0625
##  2 NH        47.4             2         48.5        7         0.055 
##  3 NM        36.7             3         51          6         0.062 
##  4 CT        36.1             4         29         41.5       0.06  
##  5 ID        35.5             5         39         21.5       0.0565
##  6 VT        34.4             6         30         38.5       0.055 
##  7 NJ        31.9             7         34.5       29         0.046 
##  8 OK        31.1             8         35         26.5       0.06  
##  9 KS        30.0             9         20         49         0.05  
## 10 PA        29.2            10         30         38.5       0.057 
## # … with 1 more variable: `ABV rank` <dbl>
#top 10 states with the least variance in IBU
bottom_ibu_sd<-beer_brewery_byState %>% 
  arrange(IBU_sd) %>% 
  select(State, IBU_sd, IBU_sd_rank, IBU_median, IBU_rank, ABV_median, ABV_rank) %>% 
  rename_all(funs(str_replace_all(., "_", " "))) %>% 
  head(10)

bottom_ibu_sd 
## # A tibble: 10 x 7
##    State `IBU sd` `IBU sd rank` `IBU median` `IBU rank` `ABV median`
##    <chr>    <dbl>         <dbl>        <dbl>      <dbl>        <dbl>
##  1 TN        15.9            48         37         24         0.0570
##  2 GA        16.2            47         55          3.5       0.055 
##  3 LA        16.4            46         31.5       35.5       0.052 
##  4 NE        17.2            45         35         26.5       0.056 
##  5 ME        17.4            44         61          1         0.051 
##  6 RI        17.9            43         24         44.5       0.055 
##  7 WV        19.1            42         57.5        2         0.062 
##  8 WA        19.9            41         38         23         0.0555
##  9 WY        20.4            40         21         47         0.05  
## 10 MT        20.5            39         40         18.5       0.055 
## # … with 1 more variable: `ABV rank` <dbl>

Summary statistics for the IBU variable.

summary(beerData$IBU)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.00   21.00   35.00   42.71   64.00  138.00    1005
ggplot(beerData, aes(x=IBU)) +
     geom_histogram(position="identity", alpha=0.5)+
     geom_vline(data=beerData,aes(xintercept=median(beerData$IBU)),linetype="dashed")+
     labs(title="International Bitterness Unit(IBU) Histogram",x="IBU", y = "Count") + 
     geom_vline(aes(xintercept = median(IBU, na.rm=T)),linetype="dotted", color="black") + annotate("text", x = c(33), y = c(120), label = c(paste("Median", median(beerData$IBU, na.rm=T))) , color="black", size=5 , angle=90, fontface="bold")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1005 rows containing non-finite values (stat_bin).
## Warning: Removed 2410 rows containing missing values (geom_vline).

ggsave('Export/IBU_by_volume.jpg')
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1005 rows containing non-finite values (stat_bin).

## Warning: Removed 2410 rows containing missing values (geom_vline).

Summary statistics for the ABV variable.

summary(beerData$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800      62
ggplot(beerData, aes(x=ABV)) +
     geom_histogram(position="identity", alpha=0.5)+
     geom_vline(data=beerData,aes(xintercept=median(beerData$ABV)),linetype="dashed")+
     labs(title="Alcohol By Volume(ABV) Histogram",x="ABV", y = "Count") + 
     geom_vline(aes(xintercept = median(ABV, na.rm=T)),linetype="dotted", color="black") + annotate("text", x = c(.053), y = c(150), label = c(paste("Median", median(beerData$ABV, na.rm=T))) , color="black", size=5 , angle=90, fontface="bold")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (stat_bin).
## Warning: Removed 2410 rows containing missing values (geom_vline).

ggsave('Export/abv_by_volume.jpg')
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (stat_bin).

## Warning: Removed 2410 rows containing missing values (geom_vline).
grid.arrange(
  beer_brewery_byState %>% 
    ggplot(aes(x=Number_of_beers, y=ABV_median)) + geom_point() + 
      labs(title="Number of Beers and ABV"),
  beer_brewery_byState %>% 
    ggplot(aes(x=Number_of_styles, y=ABV_median)) + geom_point() + 
      labs(title="Number of Styles and ABV"),
  beer_brewery_byState %>% 
    ggplot(aes(x=Number_of_breweries, y=ABV_median)) + geom_point() + 
      labs(title="Number of Breweries and ABV"),
  ncol=2, nrow=2) %>% 
  ggsave(filename="Export/beer_culture_ABV.jpg")
## Saving 7 x 5 in image

grid.arrange(
  beer_brewery_byState %>% 
    ggplot(aes(x=Number_of_beers, y=IBU_median)) + geom_point() + 
      labs(title="Number of Beers and IBU"),
  beer_brewery_byState %>% 
    ggplot(aes(x=Number_of_styles, y=IBU_median)) + geom_point() + 
      labs(title="Number of Styles and IBU"),
  beer_brewery_byState %>% 
    ggplot(aes(x=Number_of_breweries, y=IBU_median)) + geom_point() + 
      labs(title="Number of Breweries and IBU"),
  ncol=2, nrow=2) %>% 
  ggsave(filename="Export/beer_culture_IBU.jpg")
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).
## Saving 7 x 5 in image

This code produces a custom Powerpoint generating slides with data calculated and plotted above

#Read baseline PowerPoint file
my_pres <- read_pptx('Visuals/presentation.pptx') 
#view(layout_summary(my_pres))
#view(layout_properties(my_pres))

# States with the most brewries
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "States With the Most Breweries", location = ph_location(left = 3, top = .5, width = 8, height = 1)) %>%
  ph_with_table_at(value = top_brewstate,
                   height = 5, width = 8, left = 3, top = 2,
                   last_row = FALSE, last_column = FALSE, first_row = TRUE)

# States with the least breweries
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "States With the Least Breweries", location = ph_location(left = 3, top = .5, width = 8, height = 1)) %>%
  ph_with_table_at(value = bottom_brewstate,
                   height = 5, width = 8, left = 3, top = 2,
                   last_row = FALSE, last_column = FALSE, first_row = TRUE)

# How many NA's were found
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>% 
  ph_with(value = "What Data Was Missing?", location = ph_location(left = 3.5, top = .5, width = 8, height = 1)) %>%
  ph_with(value = missing_info, location = ph_location(left = 2.5, top = 3, width = 12, height = 1))
 
#General Beer Statistics Map
filename = "Export/general_statistics_map.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>% 
  ph_with(value = "General Beer Statistics Map", location = ph_location(left = 3, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#
filename = "Export/ABV_beers_map.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>% 
  ph_with(value = "ABV by State", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#
filename = "Export/IBU_beers_map.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>% 
  ph_with(value = "IBU by State", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#
filename = "Export/Ounces_data.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>% 
  ph_with(value = "Ounces by State", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Compute alcohol by volume 
filename = "Export/abv_by_volume.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>% 
  ph_with(value = "", location = ph_location(left = 3, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Compute bitterness by volume 
filename = "Export/ibu_by_volume.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>% 
  ph_with(value = "", location = ph_location(left = 3, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Relationship between Ounces and IBU
filename = "Export/ounce_IBU.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "Relationship between Ounces and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Compute the relationship between IBU and ABV
filename = "Export/ibu_abv_scat_before.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "Relationship Between ABV and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Compute the relationship between IBU and ABV
filename = "Export/ibu_abv_scat.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "Relationship Between ABV and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Relationship between State's beer activity and ABV
filename = "Export/beer_culture_ABV.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "Relationship between State's beer activity and ABV", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Relationship between State's beer activity and IBU
filename = "Export/beer_culture_IBU.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "Relationship between State's beer activity and IBU", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 10, height = 6.5), 
          location = ph_location(left = 0.5, top = 1, width = 12, height = 6.5)) 

#Summary slide
filename = "Visuals/summary.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(value = "Summary and Opportunities", location = ph_location(left = 2.5, top = .5, width = 10, height = 1)) %>%
  ph_with(external_img(src = filename, width = 13, height = 7.5),
          location = ph_location(left = .15, top = .15, width = 13, height = 7.5))
#Repository slide
filename = "Visuals/repository.jpg"
my_pres <- add_slide(x=my_pres, layout = "Title and Content", master = "Office Theme") %>%
  ph_with(external_img(src = filename, width = 13, height = 7.5),
          location = ph_location(left = .15, top = .15, width = 13, height = 7.5))

# Print to save powerpoint
extension <- format(Sys.time(),'%b%d%H%M%S')
print(my_pres, target = paste("anheuser-budwieser", extension, "v1.pptx", sep='') )
## [1] "/Users/bjholmes/Library/Mobile Documents/com~apple~CloudDocs/DataScience/Doing Data Science/Unit_7/case_study_1/anheuser-budwieserJun27232438v1.pptx"